Exploratory Data Analysis

load data

# Load the Samsung (UCI HAR) dataset into the workspace as `samsungData`
load("data/samsungData.rda") # load data
dim(samsungData)  # 7352 observations x 563 columns (see output below)
## [1] 7352  563

dimension

# Cache the dataset dimensions c(nrow, ncol) for reuse throughout the analysis
dimdata <- dim(samsungData)
dim(samsungData)
## [1] 7352  563

list all features

  • note that column 562 is the subject identifier and column 563 is the class label (activity)
# Print all 563 column names: 561 features plus `subject` (562) and `activity` (563)
colnames(samsungData)
##   [1] "tBodyAcc-mean()-X"                   
##   [2] "tBodyAcc-mean()-Y"                   
##   [3] "tBodyAcc-mean()-Z"                   
##   [4] "tBodyAcc-std()-X"                    
##   [5] "tBodyAcc-std()-Y"                    
##   [6] "tBodyAcc-std()-Z"                    
##   [7] "tBodyAcc-mad()-X"                    
##   [8] "tBodyAcc-mad()-Y"                    
##   [9] "tBodyAcc-mad()-Z"                    
##  [10] "tBodyAcc-max()-X"                    
##  [11] "tBodyAcc-max()-Y"                    
##  [12] "tBodyAcc-max()-Z"                    
##  [13] "tBodyAcc-min()-X"                    
##  [14] "tBodyAcc-min()-Y"                    
##  [15] "tBodyAcc-min()-Z"                    
##  [16] "tBodyAcc-sma()"                      
##  [17] "tBodyAcc-energy()-X"                 
##  [18] "tBodyAcc-energy()-Y"                 
##  [19] "tBodyAcc-energy()-Z"                 
##  [20] "tBodyAcc-iqr()-X"                    
##  [21] "tBodyAcc-iqr()-Y"                    
##  [22] "tBodyAcc-iqr()-Z"                    
##  [23] "tBodyAcc-entropy()-X"                
##  [24] "tBodyAcc-entropy()-Y"                
##  [25] "tBodyAcc-entropy()-Z"                
##  [26] "tBodyAcc-arCoeff()-X,1"              
##  [27] "tBodyAcc-arCoeff()-X,2"              
##  [28] "tBodyAcc-arCoeff()-X,3"              
##  [29] "tBodyAcc-arCoeff()-X,4"              
##  [30] "tBodyAcc-arCoeff()-Y,1"              
##  [31] "tBodyAcc-arCoeff()-Y,2"              
##  [32] "tBodyAcc-arCoeff()-Y,3"              
##  [33] "tBodyAcc-arCoeff()-Y,4"              
##  [34] "tBodyAcc-arCoeff()-Z,1"              
##  [35] "tBodyAcc-arCoeff()-Z,2"              
##  [36] "tBodyAcc-arCoeff()-Z,3"              
##  [37] "tBodyAcc-arCoeff()-Z,4"              
##  [38] "tBodyAcc-correlation()-X,Y"          
##  [39] "tBodyAcc-correlation()-X,Z"          
##  [40] "tBodyAcc-correlation()-Y,Z"          
##  [41] "tGravityAcc-mean()-X"                
##  [42] "tGravityAcc-mean()-Y"                
##  [43] "tGravityAcc-mean()-Z"                
##  [44] "tGravityAcc-std()-X"                 
##  [45] "tGravityAcc-std()-Y"                 
##  [46] "tGravityAcc-std()-Z"                 
##  [47] "tGravityAcc-mad()-X"                 
##  [48] "tGravityAcc-mad()-Y"                 
##  [49] "tGravityAcc-mad()-Z"                 
##  [50] "tGravityAcc-max()-X"                 
##  [51] "tGravityAcc-max()-Y"                 
##  [52] "tGravityAcc-max()-Z"                 
##  [53] "tGravityAcc-min()-X"                 
##  [54] "tGravityAcc-min()-Y"                 
##  [55] "tGravityAcc-min()-Z"                 
##  [56] "tGravityAcc-sma()"                   
##  [57] "tGravityAcc-energy()-X"              
##  [58] "tGravityAcc-energy()-Y"              
##  [59] "tGravityAcc-energy()-Z"              
##  [60] "tGravityAcc-iqr()-X"                 
##  [61] "tGravityAcc-iqr()-Y"                 
##  [62] "tGravityAcc-iqr()-Z"                 
##  [63] "tGravityAcc-entropy()-X"             
##  [64] "tGravityAcc-entropy()-Y"             
##  [65] "tGravityAcc-entropy()-Z"             
##  [66] "tGravityAcc-arCoeff()-X,1"           
##  [67] "tGravityAcc-arCoeff()-X,2"           
##  [68] "tGravityAcc-arCoeff()-X,3"           
##  [69] "tGravityAcc-arCoeff()-X,4"           
##  [70] "tGravityAcc-arCoeff()-Y,1"           
##  [71] "tGravityAcc-arCoeff()-Y,2"           
##  [72] "tGravityAcc-arCoeff()-Y,3"           
##  [73] "tGravityAcc-arCoeff()-Y,4"           
##  [74] "tGravityAcc-arCoeff()-Z,1"           
##  [75] "tGravityAcc-arCoeff()-Z,2"           
##  [76] "tGravityAcc-arCoeff()-Z,3"           
##  [77] "tGravityAcc-arCoeff()-Z,4"           
##  [78] "tGravityAcc-correlation()-X,Y"       
##  [79] "tGravityAcc-correlation()-X,Z"       
##  [80] "tGravityAcc-correlation()-Y,Z"       
##  [81] "tBodyAccJerk-mean()-X"               
##  [82] "tBodyAccJerk-mean()-Y"               
##  [83] "tBodyAccJerk-mean()-Z"               
##  [84] "tBodyAccJerk-std()-X"                
##  [85] "tBodyAccJerk-std()-Y"                
##  [86] "tBodyAccJerk-std()-Z"                
##  [87] "tBodyAccJerk-mad()-X"                
##  [88] "tBodyAccJerk-mad()-Y"                
##  [89] "tBodyAccJerk-mad()-Z"                
##  [90] "tBodyAccJerk-max()-X"                
##  [91] "tBodyAccJerk-max()-Y"                
##  [92] "tBodyAccJerk-max()-Z"                
##  [93] "tBodyAccJerk-min()-X"                
##  [94] "tBodyAccJerk-min()-Y"                
##  [95] "tBodyAccJerk-min()-Z"                
##  [96] "tBodyAccJerk-sma()"                  
##  [97] "tBodyAccJerk-energy()-X"             
##  [98] "tBodyAccJerk-energy()-Y"             
##  [99] "tBodyAccJerk-energy()-Z"             
## [100] "tBodyAccJerk-iqr()-X"                
## [101] "tBodyAccJerk-iqr()-Y"                
## [102] "tBodyAccJerk-iqr()-Z"                
## [103] "tBodyAccJerk-entropy()-X"            
## [104] "tBodyAccJerk-entropy()-Y"            
## [105] "tBodyAccJerk-entropy()-Z"            
## [106] "tBodyAccJerk-arCoeff()-X,1"          
## [107] "tBodyAccJerk-arCoeff()-X,2"          
## [108] "tBodyAccJerk-arCoeff()-X,3"          
## [109] "tBodyAccJerk-arCoeff()-X,4"          
## [110] "tBodyAccJerk-arCoeff()-Y,1"          
## [111] "tBodyAccJerk-arCoeff()-Y,2"          
## [112] "tBodyAccJerk-arCoeff()-Y,3"          
## [113] "tBodyAccJerk-arCoeff()-Y,4"          
## [114] "tBodyAccJerk-arCoeff()-Z,1"          
## [115] "tBodyAccJerk-arCoeff()-Z,2"          
## [116] "tBodyAccJerk-arCoeff()-Z,3"          
## [117] "tBodyAccJerk-arCoeff()-Z,4"          
## [118] "tBodyAccJerk-correlation()-X,Y"      
## [119] "tBodyAccJerk-correlation()-X,Z"      
## [120] "tBodyAccJerk-correlation()-Y,Z"      
## [121] "tBodyGyro-mean()-X"                  
## [122] "tBodyGyro-mean()-Y"                  
## [123] "tBodyGyro-mean()-Z"                  
## [124] "tBodyGyro-std()-X"                   
## [125] "tBodyGyro-std()-Y"                   
## [126] "tBodyGyro-std()-Z"                   
## [127] "tBodyGyro-mad()-X"                   
## [128] "tBodyGyro-mad()-Y"                   
## [129] "tBodyGyro-mad()-Z"                   
## [130] "tBodyGyro-max()-X"                   
## [131] "tBodyGyro-max()-Y"                   
## [132] "tBodyGyro-max()-Z"                   
## [133] "tBodyGyro-min()-X"                   
## [134] "tBodyGyro-min()-Y"                   
## [135] "tBodyGyro-min()-Z"                   
## [136] "tBodyGyro-sma()"                     
## [137] "tBodyGyro-energy()-X"                
## [138] "tBodyGyro-energy()-Y"                
## [139] "tBodyGyro-energy()-Z"                
## [140] "tBodyGyro-iqr()-X"                   
## [141] "tBodyGyro-iqr()-Y"                   
## [142] "tBodyGyro-iqr()-Z"                   
## [143] "tBodyGyro-entropy()-X"               
## [144] "tBodyGyro-entropy()-Y"               
## [145] "tBodyGyro-entropy()-Z"               
## [146] "tBodyGyro-arCoeff()-X,1"             
## [147] "tBodyGyro-arCoeff()-X,2"             
## [148] "tBodyGyro-arCoeff()-X,3"             
## [149] "tBodyGyro-arCoeff()-X,4"             
## [150] "tBodyGyro-arCoeff()-Y,1"             
## [151] "tBodyGyro-arCoeff()-Y,2"             
## [152] "tBodyGyro-arCoeff()-Y,3"             
## [153] "tBodyGyro-arCoeff()-Y,4"             
## [154] "tBodyGyro-arCoeff()-Z,1"             
## [155] "tBodyGyro-arCoeff()-Z,2"             
## [156] "tBodyGyro-arCoeff()-Z,3"             
## [157] "tBodyGyro-arCoeff()-Z,4"             
## [158] "tBodyGyro-correlation()-X,Y"         
## [159] "tBodyGyro-correlation()-X,Z"         
## [160] "tBodyGyro-correlation()-Y,Z"         
## [161] "tBodyGyroJerk-mean()-X"              
## [162] "tBodyGyroJerk-mean()-Y"              
## [163] "tBodyGyroJerk-mean()-Z"              
## [164] "tBodyGyroJerk-std()-X"               
## [165] "tBodyGyroJerk-std()-Y"               
## [166] "tBodyGyroJerk-std()-Z"               
## [167] "tBodyGyroJerk-mad()-X"               
## [168] "tBodyGyroJerk-mad()-Y"               
## [169] "tBodyGyroJerk-mad()-Z"               
## [170] "tBodyGyroJerk-max()-X"               
## [171] "tBodyGyroJerk-max()-Y"               
## [172] "tBodyGyroJerk-max()-Z"               
## [173] "tBodyGyroJerk-min()-X"               
## [174] "tBodyGyroJerk-min()-Y"               
## [175] "tBodyGyroJerk-min()-Z"               
## [176] "tBodyGyroJerk-sma()"                 
## [177] "tBodyGyroJerk-energy()-X"            
## [178] "tBodyGyroJerk-energy()-Y"            
## [179] "tBodyGyroJerk-energy()-Z"            
## [180] "tBodyGyroJerk-iqr()-X"               
## [181] "tBodyGyroJerk-iqr()-Y"               
## [182] "tBodyGyroJerk-iqr()-Z"               
## [183] "tBodyGyroJerk-entropy()-X"           
## [184] "tBodyGyroJerk-entropy()-Y"           
## [185] "tBodyGyroJerk-entropy()-Z"           
## [186] "tBodyGyroJerk-arCoeff()-X,1"         
## [187] "tBodyGyroJerk-arCoeff()-X,2"         
## [188] "tBodyGyroJerk-arCoeff()-X,3"         
## [189] "tBodyGyroJerk-arCoeff()-X,4"         
## [190] "tBodyGyroJerk-arCoeff()-Y,1"         
## [191] "tBodyGyroJerk-arCoeff()-Y,2"         
## [192] "tBodyGyroJerk-arCoeff()-Y,3"         
## [193] "tBodyGyroJerk-arCoeff()-Y,4"         
## [194] "tBodyGyroJerk-arCoeff()-Z,1"         
## [195] "tBodyGyroJerk-arCoeff()-Z,2"         
## [196] "tBodyGyroJerk-arCoeff()-Z,3"         
## [197] "tBodyGyroJerk-arCoeff()-Z,4"         
## [198] "tBodyGyroJerk-correlation()-X,Y"     
## [199] "tBodyGyroJerk-correlation()-X,Z"     
## [200] "tBodyGyroJerk-correlation()-Y,Z"     
## [201] "tBodyAccMag-mean()"                  
## [202] "tBodyAccMag-std()"                   
## [203] "tBodyAccMag-mad()"                   
## [204] "tBodyAccMag-max()"                   
## [205] "tBodyAccMag-min()"                   
## [206] "tBodyAccMag-sma()"                   
## [207] "tBodyAccMag-energy()"                
## [208] "tBodyAccMag-iqr()"                   
## [209] "tBodyAccMag-entropy()"               
## [210] "tBodyAccMag-arCoeff()1"              
## [211] "tBodyAccMag-arCoeff()2"              
## [212] "tBodyAccMag-arCoeff()3"              
## [213] "tBodyAccMag-arCoeff()4"              
## [214] "tGravityAccMag-mean()"               
## [215] "tGravityAccMag-std()"                
## [216] "tGravityAccMag-mad()"                
## [217] "tGravityAccMag-max()"                
## [218] "tGravityAccMag-min()"                
## [219] "tGravityAccMag-sma()"                
## [220] "tGravityAccMag-energy()"             
## [221] "tGravityAccMag-iqr()"                
## [222] "tGravityAccMag-entropy()"            
## [223] "tGravityAccMag-arCoeff()1"           
## [224] "tGravityAccMag-arCoeff()2"           
## [225] "tGravityAccMag-arCoeff()3"           
## [226] "tGravityAccMag-arCoeff()4"           
## [227] "tBodyAccJerkMag-mean()"              
## [228] "tBodyAccJerkMag-std()"               
## [229] "tBodyAccJerkMag-mad()"               
## [230] "tBodyAccJerkMag-max()"               
## [231] "tBodyAccJerkMag-min()"               
## [232] "tBodyAccJerkMag-sma()"               
## [233] "tBodyAccJerkMag-energy()"            
## [234] "tBodyAccJerkMag-iqr()"               
## [235] "tBodyAccJerkMag-entropy()"           
## [236] "tBodyAccJerkMag-arCoeff()1"          
## [237] "tBodyAccJerkMag-arCoeff()2"          
## [238] "tBodyAccJerkMag-arCoeff()3"          
## [239] "tBodyAccJerkMag-arCoeff()4"          
## [240] "tBodyGyroMag-mean()"                 
## [241] "tBodyGyroMag-std()"                  
## [242] "tBodyGyroMag-mad()"                  
## [243] "tBodyGyroMag-max()"                  
## [244] "tBodyGyroMag-min()"                  
## [245] "tBodyGyroMag-sma()"                  
## [246] "tBodyGyroMag-energy()"               
## [247] "tBodyGyroMag-iqr()"                  
## [248] "tBodyGyroMag-entropy()"              
## [249] "tBodyGyroMag-arCoeff()1"             
## [250] "tBodyGyroMag-arCoeff()2"             
## [251] "tBodyGyroMag-arCoeff()3"             
## [252] "tBodyGyroMag-arCoeff()4"             
## [253] "tBodyGyroJerkMag-mean()"             
## [254] "tBodyGyroJerkMag-std()"              
## [255] "tBodyGyroJerkMag-mad()"              
## [256] "tBodyGyroJerkMag-max()"              
## [257] "tBodyGyroJerkMag-min()"              
## [258] "tBodyGyroJerkMag-sma()"              
## [259] "tBodyGyroJerkMag-energy()"           
## [260] "tBodyGyroJerkMag-iqr()"              
## [261] "tBodyGyroJerkMag-entropy()"          
## [262] "tBodyGyroJerkMag-arCoeff()1"         
## [263] "tBodyGyroJerkMag-arCoeff()2"         
## [264] "tBodyGyroJerkMag-arCoeff()3"         
## [265] "tBodyGyroJerkMag-arCoeff()4"         
## [266] "fBodyAcc-mean()-X"                   
## [267] "fBodyAcc-mean()-Y"                   
## [268] "fBodyAcc-mean()-Z"                   
## [269] "fBodyAcc-std()-X"                    
## [270] "fBodyAcc-std()-Y"                    
## [271] "fBodyAcc-std()-Z"                    
## [272] "fBodyAcc-mad()-X"                    
## [273] "fBodyAcc-mad()-Y"                    
## [274] "fBodyAcc-mad()-Z"                    
## [275] "fBodyAcc-max()-X"                    
## [276] "fBodyAcc-max()-Y"                    
## [277] "fBodyAcc-max()-Z"                    
## [278] "fBodyAcc-min()-X"                    
## [279] "fBodyAcc-min()-Y"                    
## [280] "fBodyAcc-min()-Z"                    
## [281] "fBodyAcc-sma()"                      
## [282] "fBodyAcc-energy()-X"                 
## [283] "fBodyAcc-energy()-Y"                 
## [284] "fBodyAcc-energy()-Z"                 
## [285] "fBodyAcc-iqr()-X"                    
## [286] "fBodyAcc-iqr()-Y"                    
## [287] "fBodyAcc-iqr()-Z"                    
## [288] "fBodyAcc-entropy()-X"                
## [289] "fBodyAcc-entropy()-Y"                
## [290] "fBodyAcc-entropy()-Z"                
## [291] "fBodyAcc-maxInds-X"                  
## [292] "fBodyAcc-maxInds-Y"                  
## [293] "fBodyAcc-maxInds-Z"                  
## [294] "fBodyAcc-meanFreq()-X"               
## [295] "fBodyAcc-meanFreq()-Y"               
## [296] "fBodyAcc-meanFreq()-Z"               
## [297] "fBodyAcc-skewness()-X"               
## [298] "fBodyAcc-kurtosis()-X"               
## [299] "fBodyAcc-skewness()-Y"               
## [300] "fBodyAcc-kurtosis()-Y"               
## [301] "fBodyAcc-skewness()-Z"               
## [302] "fBodyAcc-kurtosis()-Z"               
## [303] "fBodyAcc-bandsEnergy()-1,8"          
## [304] "fBodyAcc-bandsEnergy()-9,16"         
## [305] "fBodyAcc-bandsEnergy()-17,24"        
## [306] "fBodyAcc-bandsEnergy()-25,32"        
## [307] "fBodyAcc-bandsEnergy()-33,40"        
## [308] "fBodyAcc-bandsEnergy()-41,48"        
## [309] "fBodyAcc-bandsEnergy()-49,56"        
## [310] "fBodyAcc-bandsEnergy()-57,64"        
## [311] "fBodyAcc-bandsEnergy()-1,16"         
## [312] "fBodyAcc-bandsEnergy()-17,32"        
## [313] "fBodyAcc-bandsEnergy()-33,48"        
## [314] "fBodyAcc-bandsEnergy()-49,64"        
## [315] "fBodyAcc-bandsEnergy()-1,24"         
## [316] "fBodyAcc-bandsEnergy()-25,48"        
## [317] "fBodyAcc-bandsEnergy()-1,8"          
## [318] "fBodyAcc-bandsEnergy()-9,16"         
## [319] "fBodyAcc-bandsEnergy()-17,24"        
## [320] "fBodyAcc-bandsEnergy()-25,32"        
## [321] "fBodyAcc-bandsEnergy()-33,40"        
## [322] "fBodyAcc-bandsEnergy()-41,48"        
## [323] "fBodyAcc-bandsEnergy()-49,56"        
## [324] "fBodyAcc-bandsEnergy()-57,64"        
## [325] "fBodyAcc-bandsEnergy()-1,16"         
## [326] "fBodyAcc-bandsEnergy()-17,32"        
## [327] "fBodyAcc-bandsEnergy()-33,48"        
## [328] "fBodyAcc-bandsEnergy()-49,64"        
## [329] "fBodyAcc-bandsEnergy()-1,24"         
## [330] "fBodyAcc-bandsEnergy()-25,48"        
## [331] "fBodyAcc-bandsEnergy()-1,8"          
## [332] "fBodyAcc-bandsEnergy()-9,16"         
## [333] "fBodyAcc-bandsEnergy()-17,24"        
## [334] "fBodyAcc-bandsEnergy()-25,32"        
## [335] "fBodyAcc-bandsEnergy()-33,40"        
## [336] "fBodyAcc-bandsEnergy()-41,48"        
## [337] "fBodyAcc-bandsEnergy()-49,56"        
## [338] "fBodyAcc-bandsEnergy()-57,64"        
## [339] "fBodyAcc-bandsEnergy()-1,16"         
## [340] "fBodyAcc-bandsEnergy()-17,32"        
## [341] "fBodyAcc-bandsEnergy()-33,48"        
## [342] "fBodyAcc-bandsEnergy()-49,64"        
## [343] "fBodyAcc-bandsEnergy()-1,24"         
## [344] "fBodyAcc-bandsEnergy()-25,48"        
## [345] "fBodyAccJerk-mean()-X"               
## [346] "fBodyAccJerk-mean()-Y"               
## [347] "fBodyAccJerk-mean()-Z"               
## [348] "fBodyAccJerk-std()-X"                
## [349] "fBodyAccJerk-std()-Y"                
## [350] "fBodyAccJerk-std()-Z"                
## [351] "fBodyAccJerk-mad()-X"                
## [352] "fBodyAccJerk-mad()-Y"                
## [353] "fBodyAccJerk-mad()-Z"                
## [354] "fBodyAccJerk-max()-X"                
## [355] "fBodyAccJerk-max()-Y"                
## [356] "fBodyAccJerk-max()-Z"                
## [357] "fBodyAccJerk-min()-X"                
## [358] "fBodyAccJerk-min()-Y"                
## [359] "fBodyAccJerk-min()-Z"                
## [360] "fBodyAccJerk-sma()"                  
## [361] "fBodyAccJerk-energy()-X"             
## [362] "fBodyAccJerk-energy()-Y"             
## [363] "fBodyAccJerk-energy()-Z"             
## [364] "fBodyAccJerk-iqr()-X"                
## [365] "fBodyAccJerk-iqr()-Y"                
## [366] "fBodyAccJerk-iqr()-Z"                
## [367] "fBodyAccJerk-entropy()-X"            
## [368] "fBodyAccJerk-entropy()-Y"            
## [369] "fBodyAccJerk-entropy()-Z"            
## [370] "fBodyAccJerk-maxInds-X"              
## [371] "fBodyAccJerk-maxInds-Y"              
## [372] "fBodyAccJerk-maxInds-Z"              
## [373] "fBodyAccJerk-meanFreq()-X"           
## [374] "fBodyAccJerk-meanFreq()-Y"           
## [375] "fBodyAccJerk-meanFreq()-Z"           
## [376] "fBodyAccJerk-skewness()-X"           
## [377] "fBodyAccJerk-kurtosis()-X"           
## [378] "fBodyAccJerk-skewness()-Y"           
## [379] "fBodyAccJerk-kurtosis()-Y"           
## [380] "fBodyAccJerk-skewness()-Z"           
## [381] "fBodyAccJerk-kurtosis()-Z"           
## [382] "fBodyAccJerk-bandsEnergy()-1,8"      
## [383] "fBodyAccJerk-bandsEnergy()-9,16"     
## [384] "fBodyAccJerk-bandsEnergy()-17,24"    
## [385] "fBodyAccJerk-bandsEnergy()-25,32"    
## [386] "fBodyAccJerk-bandsEnergy()-33,40"    
## [387] "fBodyAccJerk-bandsEnergy()-41,48"    
## [388] "fBodyAccJerk-bandsEnergy()-49,56"    
## [389] "fBodyAccJerk-bandsEnergy()-57,64"    
## [390] "fBodyAccJerk-bandsEnergy()-1,16"     
## [391] "fBodyAccJerk-bandsEnergy()-17,32"    
## [392] "fBodyAccJerk-bandsEnergy()-33,48"    
## [393] "fBodyAccJerk-bandsEnergy()-49,64"    
## [394] "fBodyAccJerk-bandsEnergy()-1,24"     
## [395] "fBodyAccJerk-bandsEnergy()-25,48"    
## [396] "fBodyAccJerk-bandsEnergy()-1,8"      
## [397] "fBodyAccJerk-bandsEnergy()-9,16"     
## [398] "fBodyAccJerk-bandsEnergy()-17,24"    
## [399] "fBodyAccJerk-bandsEnergy()-25,32"    
## [400] "fBodyAccJerk-bandsEnergy()-33,40"    
## [401] "fBodyAccJerk-bandsEnergy()-41,48"    
## [402] "fBodyAccJerk-bandsEnergy()-49,56"    
## [403] "fBodyAccJerk-bandsEnergy()-57,64"    
## [404] "fBodyAccJerk-bandsEnergy()-1,16"     
## [405] "fBodyAccJerk-bandsEnergy()-17,32"    
## [406] "fBodyAccJerk-bandsEnergy()-33,48"    
## [407] "fBodyAccJerk-bandsEnergy()-49,64"    
## [408] "fBodyAccJerk-bandsEnergy()-1,24"     
## [409] "fBodyAccJerk-bandsEnergy()-25,48"    
## [410] "fBodyAccJerk-bandsEnergy()-1,8"      
## [411] "fBodyAccJerk-bandsEnergy()-9,16"     
## [412] "fBodyAccJerk-bandsEnergy()-17,24"    
## [413] "fBodyAccJerk-bandsEnergy()-25,32"    
## [414] "fBodyAccJerk-bandsEnergy()-33,40"    
## [415] "fBodyAccJerk-bandsEnergy()-41,48"    
## [416] "fBodyAccJerk-bandsEnergy()-49,56"    
## [417] "fBodyAccJerk-bandsEnergy()-57,64"    
## [418] "fBodyAccJerk-bandsEnergy()-1,16"     
## [419] "fBodyAccJerk-bandsEnergy()-17,32"    
## [420] "fBodyAccJerk-bandsEnergy()-33,48"    
## [421] "fBodyAccJerk-bandsEnergy()-49,64"    
## [422] "fBodyAccJerk-bandsEnergy()-1,24"     
## [423] "fBodyAccJerk-bandsEnergy()-25,48"    
## [424] "fBodyGyro-mean()-X"                  
## [425] "fBodyGyro-mean()-Y"                  
## [426] "fBodyGyro-mean()-Z"                  
## [427] "fBodyGyro-std()-X"                   
## [428] "fBodyGyro-std()-Y"                   
## [429] "fBodyGyro-std()-Z"                   
## [430] "fBodyGyro-mad()-X"                   
## [431] "fBodyGyro-mad()-Y"                   
## [432] "fBodyGyro-mad()-Z"                   
## [433] "fBodyGyro-max()-X"                   
## [434] "fBodyGyro-max()-Y"                   
## [435] "fBodyGyro-max()-Z"                   
## [436] "fBodyGyro-min()-X"                   
## [437] "fBodyGyro-min()-Y"                   
## [438] "fBodyGyro-min()-Z"                   
## [439] "fBodyGyro-sma()"                     
## [440] "fBodyGyro-energy()-X"                
## [441] "fBodyGyro-energy()-Y"                
## [442] "fBodyGyro-energy()-Z"                
## [443] "fBodyGyro-iqr()-X"                   
## [444] "fBodyGyro-iqr()-Y"                   
## [445] "fBodyGyro-iqr()-Z"                   
## [446] "fBodyGyro-entropy()-X"               
## [447] "fBodyGyro-entropy()-Y"               
## [448] "fBodyGyro-entropy()-Z"               
## [449] "fBodyGyro-maxInds-X"                 
## [450] "fBodyGyro-maxInds-Y"                 
## [451] "fBodyGyro-maxInds-Z"                 
## [452] "fBodyGyro-meanFreq()-X"              
## [453] "fBodyGyro-meanFreq()-Y"              
## [454] "fBodyGyro-meanFreq()-Z"              
## [455] "fBodyGyro-skewness()-X"              
## [456] "fBodyGyro-kurtosis()-X"              
## [457] "fBodyGyro-skewness()-Y"              
## [458] "fBodyGyro-kurtosis()-Y"              
## [459] "fBodyGyro-skewness()-Z"              
## [460] "fBodyGyro-kurtosis()-Z"              
## [461] "fBodyGyro-bandsEnergy()-1,8"         
## [462] "fBodyGyro-bandsEnergy()-9,16"        
## [463] "fBodyGyro-bandsEnergy()-17,24"       
## [464] "fBodyGyro-bandsEnergy()-25,32"       
## [465] "fBodyGyro-bandsEnergy()-33,40"       
## [466] "fBodyGyro-bandsEnergy()-41,48"       
## [467] "fBodyGyro-bandsEnergy()-49,56"       
## [468] "fBodyGyro-bandsEnergy()-57,64"       
## [469] "fBodyGyro-bandsEnergy()-1,16"        
## [470] "fBodyGyro-bandsEnergy()-17,32"       
## [471] "fBodyGyro-bandsEnergy()-33,48"       
## [472] "fBodyGyro-bandsEnergy()-49,64"       
## [473] "fBodyGyro-bandsEnergy()-1,24"        
## [474] "fBodyGyro-bandsEnergy()-25,48"       
## [475] "fBodyGyro-bandsEnergy()-1,8"         
## [476] "fBodyGyro-bandsEnergy()-9,16"        
## [477] "fBodyGyro-bandsEnergy()-17,24"       
## [478] "fBodyGyro-bandsEnergy()-25,32"       
## [479] "fBodyGyro-bandsEnergy()-33,40"       
## [480] "fBodyGyro-bandsEnergy()-41,48"       
## [481] "fBodyGyro-bandsEnergy()-49,56"       
## [482] "fBodyGyro-bandsEnergy()-57,64"       
## [483] "fBodyGyro-bandsEnergy()-1,16"        
## [484] "fBodyGyro-bandsEnergy()-17,32"       
## [485] "fBodyGyro-bandsEnergy()-33,48"       
## [486] "fBodyGyro-bandsEnergy()-49,64"       
## [487] "fBodyGyro-bandsEnergy()-1,24"        
## [488] "fBodyGyro-bandsEnergy()-25,48"       
## [489] "fBodyGyro-bandsEnergy()-1,8"         
## [490] "fBodyGyro-bandsEnergy()-9,16"        
## [491] "fBodyGyro-bandsEnergy()-17,24"       
## [492] "fBodyGyro-bandsEnergy()-25,32"       
## [493] "fBodyGyro-bandsEnergy()-33,40"       
## [494] "fBodyGyro-bandsEnergy()-41,48"       
## [495] "fBodyGyro-bandsEnergy()-49,56"       
## [496] "fBodyGyro-bandsEnergy()-57,64"       
## [497] "fBodyGyro-bandsEnergy()-1,16"        
## [498] "fBodyGyro-bandsEnergy()-17,32"       
## [499] "fBodyGyro-bandsEnergy()-33,48"       
## [500] "fBodyGyro-bandsEnergy()-49,64"       
## [501] "fBodyGyro-bandsEnergy()-1,24"        
## [502] "fBodyGyro-bandsEnergy()-25,48"       
## [503] "fBodyAccMag-mean()"                  
## [504] "fBodyAccMag-std()"                   
## [505] "fBodyAccMag-mad()"                   
## [506] "fBodyAccMag-max()"                   
## [507] "fBodyAccMag-min()"                   
## [508] "fBodyAccMag-sma()"                   
## [509] "fBodyAccMag-energy()"                
## [510] "fBodyAccMag-iqr()"                   
## [511] "fBodyAccMag-entropy()"               
## [512] "fBodyAccMag-maxInds"                 
## [513] "fBodyAccMag-meanFreq()"              
## [514] "fBodyAccMag-skewness()"              
## [515] "fBodyAccMag-kurtosis()"              
## [516] "fBodyBodyAccJerkMag-mean()"          
## [517] "fBodyBodyAccJerkMag-std()"           
## [518] "fBodyBodyAccJerkMag-mad()"           
## [519] "fBodyBodyAccJerkMag-max()"           
## [520] "fBodyBodyAccJerkMag-min()"           
## [521] "fBodyBodyAccJerkMag-sma()"           
## [522] "fBodyBodyAccJerkMag-energy()"        
## [523] "fBodyBodyAccJerkMag-iqr()"           
## [524] "fBodyBodyAccJerkMag-entropy()"       
## [525] "fBodyBodyAccJerkMag-maxInds"         
## [526] "fBodyBodyAccJerkMag-meanFreq()"      
## [527] "fBodyBodyAccJerkMag-skewness()"      
## [528] "fBodyBodyAccJerkMag-kurtosis()"      
## [529] "fBodyBodyGyroMag-mean()"             
## [530] "fBodyBodyGyroMag-std()"              
## [531] "fBodyBodyGyroMag-mad()"              
## [532] "fBodyBodyGyroMag-max()"              
## [533] "fBodyBodyGyroMag-min()"              
## [534] "fBodyBodyGyroMag-sma()"              
## [535] "fBodyBodyGyroMag-energy()"           
## [536] "fBodyBodyGyroMag-iqr()"              
## [537] "fBodyBodyGyroMag-entropy()"          
## [538] "fBodyBodyGyroMag-maxInds"            
## [539] "fBodyBodyGyroMag-meanFreq()"         
## [540] "fBodyBodyGyroMag-skewness()"         
## [541] "fBodyBodyGyroMag-kurtosis()"         
## [542] "fBodyBodyGyroJerkMag-mean()"         
## [543] "fBodyBodyGyroJerkMag-std()"          
## [544] "fBodyBodyGyroJerkMag-mad()"          
## [545] "fBodyBodyGyroJerkMag-max()"          
## [546] "fBodyBodyGyroJerkMag-min()"          
## [547] "fBodyBodyGyroJerkMag-sma()"          
## [548] "fBodyBodyGyroJerkMag-energy()"       
## [549] "fBodyBodyGyroJerkMag-iqr()"          
## [550] "fBodyBodyGyroJerkMag-entropy()"      
## [551] "fBodyBodyGyroJerkMag-maxInds"        
## [552] "fBodyBodyGyroJerkMag-meanFreq()"     
## [553] "fBodyBodyGyroJerkMag-skewness()"     
## [554] "fBodyBodyGyroJerkMag-kurtosis()"     
## [555] "angle(tBodyAccMean,gravity)"         
## [556] "angle(tBodyAccJerkMean),gravityMean)"
## [557] "angle(tBodyGyroMean,gravityMean)"    
## [558] "angle(tBodyGyroJerkMean,gravityMean)"
## [559] "angle(X,gravityMean)"                
## [560] "angle(Y,gravityMean)"                
## [561] "angle(Z,gravityMean)"                
## [562] "subject"                             
## [563] "activity"

factorize the class labels

# Convert the class-label column (last column, `activity`) to a factor.
# Use the cached `dimdata[2]` consistently instead of recomputing dim().
samsungData[, dimdata[2]] <- factor(samsungData[, dimdata[2]])
# check if is factor
is.factor(samsungData[, dimdata[2]])
## [1] TRUE

Fix duplicated column names

# Locate duplicated feature names (84 columns share a name with an earlier one)
duplicated_index <- which(duplicated(colnames(samsungData)))
duplicated_columns <- unique(colnames(samsungData)[duplicated(colnames(samsungData))])

# Alternative approach: drop repeated columns, keeping the first occurrence
samsungData.new <- samsungData[, !duplicated(colnames(samsungData))]
dim(samsungData.new)
## [1] 7352  479
# Chosen approach: disambiguate by appending ".i" (occurrence number) to every
# repeated name — the first occurrence gets ".1", later ones ".2", ".3", ...
for (each in duplicated_columns) {
  ind <- which(colnames(samsungData) == each)
  # All names at `ind` equal `each`, so a single vectorized paste() suffices
  colnames(samsungData)[ind] <- paste(each, seq_along(ind), sep = ".")
}
# view duplicated columns names
colnames(samsungData)[duplicated_index]
##  [1] "fBodyAcc-bandsEnergy()-1,8.2"      
##  [2] "fBodyAcc-bandsEnergy()-9,16.2"     
##  [3] "fBodyAcc-bandsEnergy()-17,24.2"    
##  [4] "fBodyAcc-bandsEnergy()-25,32.2"    
##  [5] "fBodyAcc-bandsEnergy()-33,40.2"    
##  [6] "fBodyAcc-bandsEnergy()-41,48.2"    
##  [7] "fBodyAcc-bandsEnergy()-49,56.2"    
##  [8] "fBodyAcc-bandsEnergy()-57,64.2"    
##  [9] "fBodyAcc-bandsEnergy()-1,16.2"     
## [10] "fBodyAcc-bandsEnergy()-17,32.2"    
## [11] "fBodyAcc-bandsEnergy()-33,48.2"    
## [12] "fBodyAcc-bandsEnergy()-49,64.2"    
## [13] "fBodyAcc-bandsEnergy()-1,24.2"     
## [14] "fBodyAcc-bandsEnergy()-25,48.2"    
## [15] "fBodyAcc-bandsEnergy()-1,8.3"      
## [16] "fBodyAcc-bandsEnergy()-9,16.3"     
## [17] "fBodyAcc-bandsEnergy()-17,24.3"    
## [18] "fBodyAcc-bandsEnergy()-25,32.3"    
## [19] "fBodyAcc-bandsEnergy()-33,40.3"    
## [20] "fBodyAcc-bandsEnergy()-41,48.3"    
## [21] "fBodyAcc-bandsEnergy()-49,56.3"    
## [22] "fBodyAcc-bandsEnergy()-57,64.3"    
## [23] "fBodyAcc-bandsEnergy()-1,16.3"     
## [24] "fBodyAcc-bandsEnergy()-17,32.3"    
## [25] "fBodyAcc-bandsEnergy()-33,48.3"    
## [26] "fBodyAcc-bandsEnergy()-49,64.3"    
## [27] "fBodyAcc-bandsEnergy()-1,24.3"     
## [28] "fBodyAcc-bandsEnergy()-25,48.3"    
## [29] "fBodyAccJerk-bandsEnergy()-1,8.2"  
## [30] "fBodyAccJerk-bandsEnergy()-9,16.2" 
## [31] "fBodyAccJerk-bandsEnergy()-17,24.2"
## [32] "fBodyAccJerk-bandsEnergy()-25,32.2"
## [33] "fBodyAccJerk-bandsEnergy()-33,40.2"
## [34] "fBodyAccJerk-bandsEnergy()-41,48.2"
## [35] "fBodyAccJerk-bandsEnergy()-49,56.2"
## [36] "fBodyAccJerk-bandsEnergy()-57,64.2"
## [37] "fBodyAccJerk-bandsEnergy()-1,16.2" 
## [38] "fBodyAccJerk-bandsEnergy()-17,32.2"
## [39] "fBodyAccJerk-bandsEnergy()-33,48.2"
## [40] "fBodyAccJerk-bandsEnergy()-49,64.2"
## [41] "fBodyAccJerk-bandsEnergy()-1,24.2" 
## [42] "fBodyAccJerk-bandsEnergy()-25,48.2"
## [43] "fBodyAccJerk-bandsEnergy()-1,8.3"  
## [44] "fBodyAccJerk-bandsEnergy()-9,16.3" 
## [45] "fBodyAccJerk-bandsEnergy()-17,24.3"
## [46] "fBodyAccJerk-bandsEnergy()-25,32.3"
## [47] "fBodyAccJerk-bandsEnergy()-33,40.3"
## [48] "fBodyAccJerk-bandsEnergy()-41,48.3"
## [49] "fBodyAccJerk-bandsEnergy()-49,56.3"
## [50] "fBodyAccJerk-bandsEnergy()-57,64.3"
## [51] "fBodyAccJerk-bandsEnergy()-1,16.3" 
## [52] "fBodyAccJerk-bandsEnergy()-17,32.3"
## [53] "fBodyAccJerk-bandsEnergy()-33,48.3"
## [54] "fBodyAccJerk-bandsEnergy()-49,64.3"
## [55] "fBodyAccJerk-bandsEnergy()-1,24.3" 
## [56] "fBodyAccJerk-bandsEnergy()-25,48.3"
## [57] "fBodyGyro-bandsEnergy()-1,8.2"     
## [58] "fBodyGyro-bandsEnergy()-9,16.2"    
## [59] "fBodyGyro-bandsEnergy()-17,24.2"   
## [60] "fBodyGyro-bandsEnergy()-25,32.2"   
## [61] "fBodyGyro-bandsEnergy()-33,40.2"   
## [62] "fBodyGyro-bandsEnergy()-41,48.2"   
## [63] "fBodyGyro-bandsEnergy()-49,56.2"   
## [64] "fBodyGyro-bandsEnergy()-57,64.2"   
## [65] "fBodyGyro-bandsEnergy()-1,16.2"    
## [66] "fBodyGyro-bandsEnergy()-17,32.2"   
## [67] "fBodyGyro-bandsEnergy()-33,48.2"   
## [68] "fBodyGyro-bandsEnergy()-49,64.2"   
## [69] "fBodyGyro-bandsEnergy()-1,24.2"    
## [70] "fBodyGyro-bandsEnergy()-25,48.2"   
## [71] "fBodyGyro-bandsEnergy()-1,8.3"     
## [72] "fBodyGyro-bandsEnergy()-9,16.3"    
## [73] "fBodyGyro-bandsEnergy()-17,24.3"   
## [74] "fBodyGyro-bandsEnergy()-25,32.3"   
## [75] "fBodyGyro-bandsEnergy()-33,40.3"   
## [76] "fBodyGyro-bandsEnergy()-41,48.3"   
## [77] "fBodyGyro-bandsEnergy()-49,56.3"   
## [78] "fBodyGyro-bandsEnergy()-57,64.3"   
## [79] "fBodyGyro-bandsEnergy()-1,16.3"    
## [80] "fBodyGyro-bandsEnergy()-17,32.3"   
## [81] "fBodyGyro-bandsEnergy()-33,48.3"   
## [82] "fBodyGyro-bandsEnergy()-49,64.3"   
## [83] "fBodyGyro-bandsEnergy()-1,24.3"    
## [84] "fBodyGyro-bandsEnergy()-25,48.3"
# Persist the duplicated-column indices, one per line, for later reference
write(duplicated_index, "duplicated_column_indices.txt", sep="\n")

summary of class label

# Frequency of each activity label (last column)
table(samsungData[, dimdata[2]])
## 
##   laying  sitting standing     walk walkdown   walkup 
##     1407     1286     1374     1226      986     1073

summary of subject numbers

table(samsungData[, dimdata[2]-1])
## 
##   1   3   5   6   7   8  11  14  15  16  17  19  21  22  23  25  26  27 
## 347 341 302 325 308 281 316 323 328 366 368 360 408 321 372 409 392 376 
##  28  29  30 
## 382 344 383
barplot(summary(factor(samsungData[, dimdata[2]-1])), cex.names =0.6)
title("number of data points for each subject")

summary of class labels per subject

# Contingency table: rows = subject ids, columns = activity labels.
# my_df is reused by several later chunks — keep the name.
my_df = table(samsungData$subject, samsungData$activity)
par(cex.main = 1)
# plot() on a table dispatches to a mosaic plot of the label distribution
plot(my_df, main="")
title(main="distribution of class labels per subject", outer = FALSE)

# Grouped bar chart: one bar per activity within each subject.
# NOTE(review): color 3 appears twice in the palette, so two activities share
# a color — consider six distinct colors
barplot(t(as.matrix(my_df)), beside = TRUE, main = "number of datapoints for each activity per subject", col = c(3,4,6,7,3,1))
legend("topright", colnames(my_df), col = c(3,4,6,7,3,1), lty = 1, lwd = 2, cex = 0.5)

check balance
# One box per subject over its six per-activity counts; annotate the
# highest-count activity for the first (subject 1) and last (subject 30,
# the 21st box since only 21 subjects are present) boxes
boxplot.matrix(t(my_df), xlab="subject", ylab="count", main="Number of instances per activity for each subject")
lb  = c(names(which.max(my_df['1',])),names(which.max(my_df['30',])))
text(c(1,21), c(max(my_df['1',]), max(my_df['30',])), lb, col="red", pos=c(4,3), cex=0.9)

  • ggplot version
## ggplot2
library(reshape2)
library(ggplot2)
# Long format: one row per (subject, activity, count); `melted` is also the
# input to the ANOVA chunk below — keep the name
melted = melt(my_df)
colnames(melted)[1:2] = c("subject","activity")
# Labels/positions of the max-count activity for subjects 1, 15 and 30
lab = c(names(which.max(my_df["1",])),names(which.max(my_df["15",])),names(which.max(my_df["30",])))
where = c(max(my_df["1",]),max(my_df["15",]), max(my_df["30",]))
ggplot(melted, aes(factor(subject), value))+geom_boxplot()+labs(title = "Number of Instances Per Activity For Each Subject", x="subject", y="count")+geom_text(aes("1",where[1]+2), label=lab[1], color="blue")+geom_text(aes("30",where[3]+2), label=lab[3], color="blue")+geom_text(aes("15",where[2]+2), label=lab[2], color="blue")

ANOVA

  • p val = 0.0657 > 0.05. Therefore, cannot reject H0 that means are equal at alpha = 0.05
  • therefore, we decided to split train and test groups by subjects
# One-way ANOVA: do mean per-activity instance counts differ across subjects?
# `melted` has one row per (subject, activity) with its count in `value`.
# p = 0.0657 > 0.05, so the balance across subjects is acceptable.
my_df_aov = aov(value~factor(subject),data = melted)
summary(my_df_aov)
##                  Df Sum Sq Mean Sq F value Pr(>F)  
## factor(subject)  20   4225   211.3   1.602 0.0657 .
## Residuals       105  13849   131.9                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

boxplot

  • see the range of data for each feature
# Boxplot the feature columns in windows of ~50 so each plot stays readable.
# Feature columns are 1..(ncol-2); the last two columns are subject and label.
last_feature_index = dimdata[2]-2
for (i in seq(1, last_feature_index, 50)) {
  # Clamp the window end at the last feature column
  if (i + 50 < last_feature_index) {
    end = i + 50
  } else {
    end = last_feature_index
  }
  # Bug fix: the title previously printed i+50 even when the window was
  # clamped, mislabeling the final plot; use the actual window end
  boxplot(samsungData[, i:end], main = paste("[", i, ",", end, "]"))
}

#### boxplot

# my_df is a subject x activity table, so this draws one box per activity
# (columns) over the 21 subjects' counts
boxplot.matrix(my_df)

variance across samples per feature

par(mfrow=c(1,1))

# Filter-method statistics over the 561 feature columns.
# vapply over the data.frame columns yields the same named numeric vector as
# apply(samsungData[, 1:561], 2, ...) did.
features = samsungData[, 1:561]

# Per-feature variance
var_per_feature = vapply(features, var, numeric(1))

barplot(var_per_feature, axisnames = FALSE, cex.names = 0.8, cex.axis = 0.8,
        xlab = "feature", main = "Variances")

# Per-feature mean
mean_per_feature = vapply(features, mean, numeric(1))

barplot(mean_per_feature, axisnames = FALSE, cex.names = 0.8, cex.axis = 0.8,
        xlab = "feature", main = "Means")

# Coefficient of variation sd/mean (may be negative when the mean is)
cv_per_feature = vapply(features, function(x) sd(x) / mean(x), numeric(1))

barplot(cv_per_feature, axisnames = FALSE, cex.names = 0.8, cex.axis = 0.8,
        xlab = "feature", main = "CVs")

# |CV|: identical to recomputing abs(sd(x)/mean(x)) column by column
abs_cv_per_feature = abs(cv_per_feature)

barplot(abs_cv_per_feature, axisnames = FALSE, cex.names = 0.8, cex.axis = 0.8,
        xlab = "feature", main = "Absolute CVs")

# Rank features by variance and by |CV|; names ride along with the subscript
sorted_variances = var_per_feature[order(var_per_feature, decreasing = TRUE)]

sorted_abs_cvs = abs_cv_per_feature[order(abs_cv_per_feature, decreasing = TRUE)]

# Keep the five highest-ranked features from each ranking (used later as the
# filter-selected feature subsets)
top_5_sorted_variances = head(sorted_variances, 5)
top_5_sorted_abs_cvs = head(sorted_abs_cvs, 5)
names_top_5_sorted_variances = names(top_5_sorted_variances)
names_top_5_sorted_abs_cvs = names(top_5_sorted_abs_cvs)

Split data into training and test set

  • split by subjects
  • randomly select 80% of subjects for train and 20% for test
# Split data into a training and a test set BY SUBJECT (all rows of a given
# subject land in exactly one split), then write the full/train/test CSVs.
#
# Side effects (kept for compatibility with downstream code):
#   - trainning_indices / test_indices are assigned in the global env via <<-
#   - three CSV files are written to the working directory
#
# Args:
#   data: data.frame whose second-to-last column is the subject id and which
#         also has a column literally named "subject"
#   seed: RNG seed for reproducibility of the subject sample
split_data = function (data, seed) {
  set.seed(seed) # set seed for reproducibility
  # Subject ids = levels of the second-to-last column
  subject_list = levels(factor(data[, ncol(data) - 1]))
  training_ratio = 0.8
  n_train = floor(training_ratio * length(subject_list))
  trainning_subjects = sample(subject_list, n_train, replace = FALSE)
  trainning_indices <<- which(data[, "subject"] %in% trainning_subjects)
  # Bug fix: this previously tested subject_list against trainning_INDICES
  # (row numbers), so test_subjects was always the full subject list;
  # compare against the sampled training subjects instead
  test_subjects = subject_list[!(subject_list %in% trainning_subjects)]

  test_indices <<- which(!data[, "subject"] %in% trainning_subjects)

  write.table(data, file = "samsungData_fixed-duplicated-columns.csv",
              row.names = FALSE, col.names = TRUE, sep = ",")

  write.table(data[trainning_indices, ],
              file = "samsungData_fixed-duplicated-columns.train.csv",
              row.names = FALSE, col.names = TRUE, sep = ",")

  write.table(data[test_indices, ],
              file = "samsungData_fixed-duplicated-columns.test.csv",
              row.names = FALSE, col.names = TRUE, sep = ",")

}
# Deterministic 80/20 subject-level split of the Samsung data
split_data(data = samsungData, seed = 123)

Feature Selection

Train

  • using Random forest in h2o.ai package
  • Does random forest perform worse with the normalized data?
  • 10-fold cross-validation
  • use seed = 123 for reproducibility reason
# Remove any previously installed/attached H2O package before the pinned
# reinstall below.
if ("package:h2o" %in% search()) { detach("package:h2o", unload=TRUE) }
if ("h2o" %in% rownames(installed.packages())) { remove.packages("h2o") }

# Install each package H2O depends on, but only when it is missing.
for (dep in c("methods", "statmod", "stats", "graphics",
              "RCurl", "jsonlite", "tools", "utils")) {
  if (!(dep %in% rownames(installed.packages()))) {
    install.packages(dep)
  }
}

# Pinned h2o_3.8.2.6 source build from the rel-turchin release channel
install.packages("h2o", type="source", repos=(c("https://h2o-release.s3.amazonaws.com/h2o/rel-turchin/6/R")))

load h2o module and start h2o node in the local machine

## load modules and start h2o compute node
# Attaching h2o masks several base/stats functions (sd, var, apply, ...) —
# see the startup message below
library(h2o)
## Loading required package: statmod
## 
## ----------------------------------------------------------------------
## 
## Your next step is to start H2O:
##     > h2o.init()
## 
## For H2O package documentation, ask for help:
##     > ??h2o
## 
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit http://docs.h2o.ai
## 
## ----------------------------------------------------------------------
## 
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
## 
##     sd, var
## The following objects are masked from 'package:base':
## 
##     &&, %*%, %in%, ||, apply, as.factor, as.numeric, colnames,
##     colnames<-, ifelse, is.character, is.factor, is.numeric, log,
##     log10, log1p, log2, round, signif, trunc
# Start (or attach to an already-running) local H2O cluster
localH2O = h2o.init(ip = "localhost",  startH2O = TRUE)
##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         19 hours 27 minutes 
##     H2O cluster version:        3.8.2.6 
##     H2O cluster name:           H2O_started_from_R_tkhunkhe_dzx670 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   1.26 GB 
##     H2O cluster total cores:    4 
##     H2O cluster allowed cores:  2 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     R Version:                  R version 3.3.0 (2016-05-03)
# upload file to h2o 
# Push the training CSV (written by split_data) into the H2O cluster
samsungData.hex = h2o.uploadFile(path = "samsungData_fixed-duplicated-columns.train.csv")
dim(samsungData.hex)
# NOTE(review): `ncol` shadows base::ncol here — it works, but is confusing
ncol = dim(samsungData.hex)[2]
# Predictors = all columns except the last two (subject, activity);
# response = the last column (activity)
x = colnames(samsungData.hex)[-((ncol-1):ncol)]
y = colnames(samsungData.hex)[ncol]

# classification with random forest, and get the top most important features used
# model_id.txt caches model ids so reruns reload instead of retraining;
# if the lookup fails, train a fresh 10-fold-CV random forest and cache it
list_models = read.table("model_id.txt", sep="\t", header = FALSE,col.names = c('model', 'model_id'), stringsAsFactors = FALSE)
rownames(list_models) = list_models[,1]
list_models = list_models[-1]
tryCatch ( {
  model <<- h2o.getModel(list_models['full model',])},  # <<- save to global 
  error=function(e) {
    model <<- h2o.randomForest(x, y, seed = 123, samsungData.hex, nfolds = 10)
    write(paste("full model",model@model_id, sep="\t"), "model_id.txt", append = FALSE)} )

try no cross-validation

# Same random forest fit but without 10-fold CV, for comparison of the
# selected important variables and of runtime
model_no_cross_validate = h2o.randomForest(x, y, seed = 123, samsungData.hex)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=                                                                |   2%
  |                                                                       
  |====                                                             |   6%
  |                                                                       
  |======                                                           |  10%
  |                                                                       
  |=========                                                        |  14%
  |                                                                       
  |============                                                     |  18%
  |                                                                       
  |==============                                                   |  22%
  |                                                                       
  |=================                                                |  26%
  |                                                                       
  |====================                                             |  30%
  |                                                                       
  |======================                                           |  34%
  |                                                                       
  |=========================                                        |  38%
  |                                                                       
  |===========================                                      |  42%
  |                                                                       
  |==============================                                   |  46%
  |                                                                       
  |================================                                 |  50%
  |                                                                       
  |===================================                              |  54%
  |                                                                       
  |=======================================                          |  60%
  |                                                                       
  |==========================================                       |  64%
  |                                                                       
  |============================================                     |  68%
  |                                                                       
  |===============================================                  |  72%
  |                                                                       
  |=================================================                |  76%
  |                                                                       
  |====================================================             |  80%
  |                                                                       
  |=======================================================          |  84%
  |                                                                       
  |==========================================================       |  90%
  |                                                                       
  |=============================================================    |  94%
  |                                                                       
  |================================================================ |  98%
  |                                                                       
  |=================================================================| 100%
  • cross-validated model
# Top-5 important variables from the cross-validated model
vars_from_cv = h2o.varimp(model)[1:5,'variable']
  • no-cross-validated model
# Top-5 important variables from the non-cross-validated model
vars_from_no_cv = h2o.varimp(model_no_cross_validate)[1:5,'variable']
# Both fits pick the same top-5 variables
identical(vars_from_cv, vars_from_no_cv) # true
## [1] TRUE
compare runtime
model_no_cross_validate @model$run_time # run_time is in ms (~25.5 s here)
## [1] 25527
model@model$run_time # ~25.5 s as well; CV adds little wall time here
## [1] 25518

baseline accuracy

confusion matrix
# Confusion matrix of the full (all-features) cross-validated model
h2o.confusionMatrix(model)
## Confusion Matrix: vertical: actual; across: predicted
##          laying sitting standing walk walkdown walkup  Error          Rate
## laying     1092       0        0    0        0      0 0.0000 =   0 / 1,092
## sitting       0     955       59    0        0      0 0.0582 =  59 / 1,014
## standing      0      21     1063    0        0      0 0.0194 =  21 / 1,084
## walk          0       1        2  942        5      5 0.0136 =    13 / 955
## walkdown      0       0        0    4      753      7 0.0144 =    11 / 764
## walkup        0       0        0    1        1    824 0.0024 =     2 / 826
## Totals     1092     977     1124  947      759    836 0.0185 = 106 / 5,735
total accuracy
# Overall accuracy = 1 - total error rate from the confusion matrix
1 - h2o.confusionMatrix(model)['Totals','Error']
## [1] 0.981517

accuracy across cv

# Mean/sd of each CV metric over the 10 folds; keep the accuracy row as the
# all-feature baseline for the later feature-subset comparisons
baseline_accuracy_cv = model@model$cross_validation_metrics_summary[,c('mean', 'sd')]
baseline_accuracy = baseline_accuracy_cv['accuracy',]

Hit ratio

  • hit ratio at k: the fraction of observations whose true class is among the model's top-k predicted classes
check varimp across cv
  • The result confirmed that the algorithm selected the variables that are most common across 10-fold cross-validation
# Stability check: compare the top-5 important variables of each of the 10
# CV fold models against the final model (column "selected")
compared_top_imp_vars =matrix(nrow=5, ncol=0)
for (m in h2o.cross_validation_models(model)){
  compared_top_imp_vars = cbind(compared_top_imp_vars, h2o.varimp(m)$variable[1:5])
}
compared_top_imp_vars = cbind(compared_top_imp_vars,h2o.varimp(model)$variable[1:5])
colnames(compared_top_imp_vars) = c(1:10, "selected")
rownames(compared_top_imp_vars) = paste("var",c(1:5), sep="")
# Transpose so each row is one fold (or "selected") for readable printing
t(as.data.frame(compared_top_imp_vars))
##          var1                   var2                    
## 1        "tGravityAcc-mean()-Y" "tGravityAcc-min()-X"   
## 2        "tGravityAcc-mean()-Y" "angle(X,gravityMean)"  
## 3        "tGravityAcc-mean()-Y" "tGravityAcc-energy()-X"
## 4        "tGravityAcc-mean()-Y" "angle(X,gravityMean)"  
## 5        "tGravityAcc-mean()-Y" "angle(X,gravityMean)"  
## 6        "tGravityAcc-mean()-Y" "angle(X,gravityMean)"  
## 7        "tGravityAcc-mean()-Y" "angle(X,gravityMean)"  
## 8        "tGravityAcc-mean()-Y" "angle(X,gravityMean)"  
## 9        "tGravityAcc-mean()-Y" "angle(X,gravityMean)"  
## 10       "tGravityAcc-mean()-Y" "tGravityAcc-min()-X"   
## selected "tGravityAcc-mean()-Y" "tGravityAcc-min()-X"   
##          var3                   var4                    
## 1        "angle(X,gravityMean)" "tGravityAcc-energy()-X"
## 2        "tGravityAcc-min()-X"  "tGravityAcc-energy()-X"
## 3        "angle(X,gravityMean)" "tGravityAcc-min()-X"   
## 4        "tGravityAcc-min()-X"  "tGravityAcc-energy()-X"
## 5        "tGravityAcc-min()-X"  "tGravityAcc-energy()-X"
## 6        "tGravityAcc-min()-X"  "tGravityAcc-energy()-X"
## 7        "tGravityAcc-min()-X"  "tGravityAcc-energy()-X"
## 8        "tGravityAcc-min()-X"  "tGravityAcc-energy()-X"
## 9        "tGravityAcc-min()-X"  "tGravityAcc-energy()-X"
## 10       "angle(X,gravityMean)" "tGravityAcc-energy()-X"
## selected "angle(X,gravityMean)" "tGravityAcc-energy()-X"
##          var5                  
## 1        "tGravityAcc-min()-Z" 
## 2        "tGravityAcc-max()-Y" 
## 3        "tGravityAcc-min()-Z" 
## 4        "angle(Y,gravityMean)"
## 5        "tGravityAcc-max()-Y" 
## 6        "tGravityAcc-min()-Z" 
## 7        "angle(Y,gravityMean)"
## 8        "tGravityAcc-max()-Y" 
## 9        "tGravityAcc-max()-Y" 
## 10       "angle(Y,gravityMean)"
## selected "tGravityAcc-max()-Y"
select top features to train classification models (random forest)
# select top features
# select top features
# Variable-importance ranking of the full model; keep nested prefixes of the
# top five so models of increasing feature count can be trained below.
ranked_vars = model@model$variable_importances$variable
top5_important_feature = ranked_vars[1:5]
top4_important_feature = ranked_vars[1:4]
top3_important_feature = ranked_vars[1:3]
top2_important_feature = ranked_vars[1:2]
top1_important_feature = ranked_vars[1]

### Train models with the top 1-5 important features until 80% accuracy is achieved

# Reload the k-featured models from the id cache when available; otherwise
# train one 10-fold-CV random forest per feature-subset size (top-1..top-5)
# and append their ids to model_id.txt
tryCatch( {
  model.with.1.features<<- h2o.getModel(list_models['1-featured model',])
  model.with.2.features<<- h2o.getModel(list_models['2-featured model',])
  model.with.3.features<<- h2o.getModel(list_models['3-featured model',])
  model.with.4.features<<- h2o.getModel(list_models['4-featured model',])
  model.with.5.features<<- h2o.getModel(list_models['5-featured model',])
}, error = function(e) {
  model.with.1.features <<- h2o.randomForest(top1_important_feature, y, seed = 123, samsungData.hex, nfolds = 10)
  model.with.2.features <<- h2o.randomForest(top2_important_feature, y, seed = 123, samsungData.hex, nfolds = 10)
  model.with.3.features <<- h2o.randomForest(top3_important_feature, y, seed = 123, samsungData.hex, nfolds = 10)
  model.with.4.features <<- h2o.randomForest(top4_important_feature, y, seed = 123, samsungData.hex, nfolds = 10)
  model.with.5.features <<- h2o.randomForest(top5_important_feature, y, seed = 123, samsungData.hex, nfolds = 10)
  write(paste("1-featured model",model.with.1.features@model_id, sep="\t"), "model_id.txt", append = TRUE)
  write(paste("2-featured model",model.with.2.features@model_id, sep="\t"), "model_id.txt", append = TRUE)
  write(paste("3-featured model",model.with.3.features@model_id, sep="\t"), "model_id.txt", append = TRUE)
  write(paste("4-featured model",model.with.4.features@model_id, sep="\t"), "model_id.txt", append = TRUE)
  write(paste("5-featured model",model.with.5.features@model_id, sep="\t"), "model_id.txt", append = TRUE)
  
})
# NOTE(review): the lines below are an exact duplicate of the varimp
# stability check already run earlier — they recompute the same table and
# could be removed
compared_top_imp_vars =matrix(nrow=5, ncol=0)
for (m in h2o.cross_validation_models(model)){
  compared_top_imp_vars = cbind(compared_top_imp_vars, h2o.varimp(m)$variable[1:5])
}
compared_top_imp_vars = cbind(compared_top_imp_vars,h2o.varimp(model)$variable[1:5])
colnames(compared_top_imp_vars) = c(1:10, "selected")
rownames(compared_top_imp_vars) = paste("var",c(1:5), sep="")
t(as.data.frame(compared_top_imp_vars))
##          var1                   var2                    
## 1        "tGravityAcc-mean()-Y" "tGravityAcc-min()-X"   
## 2        "tGravityAcc-mean()-Y" "angle(X,gravityMean)"  
## 3        "tGravityAcc-mean()-Y" "tGravityAcc-energy()-X"
## 4        "tGravityAcc-mean()-Y" "angle(X,gravityMean)"  
## 5        "tGravityAcc-mean()-Y" "angle(X,gravityMean)"  
## 6        "tGravityAcc-mean()-Y" "angle(X,gravityMean)"  
## 7        "tGravityAcc-mean()-Y" "angle(X,gravityMean)"  
## 8        "tGravityAcc-mean()-Y" "angle(X,gravityMean)"  
## 9        "tGravityAcc-mean()-Y" "angle(X,gravityMean)"  
## 10       "tGravityAcc-mean()-Y" "tGravityAcc-min()-X"   
## selected "tGravityAcc-mean()-Y" "tGravityAcc-min()-X"   
##          var3                   var4                    
## 1        "angle(X,gravityMean)" "tGravityAcc-energy()-X"
## 2        "tGravityAcc-min()-X"  "tGravityAcc-energy()-X"
## 3        "angle(X,gravityMean)" "tGravityAcc-min()-X"   
## 4        "tGravityAcc-min()-X"  "tGravityAcc-energy()-X"
## 5        "tGravityAcc-min()-X"  "tGravityAcc-energy()-X"
## 6        "tGravityAcc-min()-X"  "tGravityAcc-energy()-X"
## 7        "tGravityAcc-min()-X"  "tGravityAcc-energy()-X"
## 8        "tGravityAcc-min()-X"  "tGravityAcc-energy()-X"
## 9        "tGravityAcc-min()-X"  "tGravityAcc-energy()-X"
## 10       "angle(X,gravityMean)" "tGravityAcc-energy()-X"
## selected "angle(X,gravityMean)" "tGravityAcc-energy()-X"
##          var5                  
## 1        "tGravityAcc-min()-Z" 
## 2        "tGravityAcc-max()-Y" 
## 3        "tGravityAcc-min()-Z" 
## 4        "angle(Y,gravityMean)"
## 5        "tGravityAcc-max()-Y" 
## 6        "tGravityAcc-min()-Z" 
## 7        "angle(Y,gravityMean)"
## 8        "tGravityAcc-max()-Y" 
## 9        "tGravityAcc-max()-Y" 
## 10       "angle(Y,gravityMean)"
## selected "tGravityAcc-max()-Y"
train with variable selected from filter method
# select top features
# Filter-method feature subsets computed earlier: most-variant features and
# highest-|CV| features, top 3/4/5 of each ranking
top5_important_feature_filter_variance = names_top_5_sorted_variances
top5_important_feature_filter_abs_cv = names_top_5_sorted_abs_cvs
top4_important_feature_filter_variance = names_top_5_sorted_variances[1:4]
top4_important_feature_filter_abs_cv = names_top_5_sorted_abs_cvs[1:4]
top3_important_feature_filter_variance = names_top_5_sorted_variances[1:3]
top3_important_feature_filter_abs_cv = names_top_5_sorted_abs_cvs[1:3]

# Re-derive the response column name (y is clobbered by plotting code)
y = colnames(samsungData.hex)[ncol]
# Reload the filter-selected models from the id cache when available;
# otherwise train them (10-fold CV each) and append their ids to the cache
tryCatch( {
  # model.with.1.features<<- h2o.getModel(list_models['1-featured model',])
  # model.with.2.features<<- h2o.getModel(list_models['2-featured model',])
  # model.with.3.features<<- h2o.getModel(list_models['3-featured model',])
  # model.with.4.features<<- h2o.getModel(list_models['4-featured model',])
  model.with.5.features_filter_variance <<- h2o.getModel(list_models['5-featured model_filter_variance',])
    model.with.5.features_filter_abs_cv <<- h2o.getModel(list_models['5-featured model_filter_cv',])
      model.with.4.features_filter_variance <<- h2o.getModel(list_models['4-featured model_filter_variance',])
    model.with.4.features_filter_abs_cv <<- h2o.getModel(list_models['4-featured model_filter_cv',])
      model.with.3.features_filter_variance <<- h2o.getModel(list_models['3-featured model_filter_variance',])
    model.with.3.features_filter_abs_cv <<- h2o.getModel(list_models['3-featured model_filter_cv',])
    
}, error = function(e) {
  model.with.5.features_filter_variance <<- h2o.randomForest(top5_important_feature_filter_variance, y, seed = 123, samsungData.hex, nfolds = 10)
  model.with.5.features_filter_abs_cv <<- h2o.randomForest(top5_important_feature_filter_abs_cv, y, seed = 123, samsungData.hex, nfolds = 10)
    model.with.4.features_filter_variance <<- h2o.randomForest(top4_important_feature_filter_variance, y, seed = 123, samsungData.hex, nfolds = 10)
  model.with.4.features_filter_abs_cv <<- h2o.randomForest(top4_important_feature_filter_abs_cv, y, seed = 123, samsungData.hex, nfolds = 10)
    model.with.3.features_filter_variance <<- h2o.randomForest(top3_important_feature_filter_variance, y, seed = 123, samsungData.hex, nfolds = 10)
  model.with.3.features_filter_abs_cv <<- h2o.randomForest(top3_important_feature_filter_abs_cv, y, seed = 123, samsungData.hex, nfolds = 10)
  write(paste("5-featured model_filter_variance",model.with.5.features_filter_variance@model_id, sep="\t"), "model_id.txt", append = TRUE)
    write(paste("5-featured model_filter_cv",model.with.5.features_filter_abs_cv@model_id, sep="\t"), "model_id.txt", append = TRUE)
      write(paste("4-featured model_filter_variance",model.with.4.features_filter_variance@model_id, sep="\t"), "model_id.txt", append = TRUE)
    write(paste("4-featured model_filter_cv",model.with.4.features_filter_abs_cv@model_id, sep="\t"), "model_id.txt", append = TRUE)
      write(paste("3-featured model_filter_variance",model.with.3.features_filter_variance@model_id, sep="\t"), "model_id.txt", append = TRUE)
    write(paste("3-featured model_filter_cv",model.with.3.features_filter_abs_cv@model_id, sep="\t"), "model_id.txt", append = TRUE)
  
})

Accuracy

RandomForest selected models

# CV accuracy (mean, sd) for the 1..5-feature models, plus the all-feature
# baseline as the last row
accuracy.randForest = data.frame(0,0)
colnames(accuracy.randForest) = c('mean','sd')
# model_list is reused later to pick the final selected model — keep the name
model_list = list( model.with.1.features,model.with.2.features,model.with.3.features,model.with.4.features,model.with.5.features)
for (i in 1:length(model_list)){
  accuracy.randForest[i,] = as.numeric(model_list[[i]]@model$cross_validation_metrics_summary['accuracy',c('mean','sd')])
}
accuracy.randForest['all',] = as.numeric( baseline_accuracy)
accuracy.randForest
##          mean          sd
## 1   0.4983688 0.012374596
## 2   0.7921445 0.013212611
## 3   0.8808801 0.008699819
## 4   0.8851942 0.006473109
## 5   0.8941538 0.009877398
## all 0.9824840 0.003290585
Accuracy plot
# ggplot2
# NOTE(review): these assignments overwrite the h2o predictor/response
# variables x and y defined earlier (y is re-set before reuse, x is not),
# and `sd` masks stats::sd in the workspace
x  = rownames(accuracy.randForest)
y = accuracy.randForest[,1]
sd = accuracy.randForest[,2]
h = 0.8
qplot(x,y)+geom_errorbar(aes(x=x, ymin=y-sd, ymax=y+sd), width=0.25)+labs(title = "Accuracies of Models with Different Numbers of Features Used", x="number of features used", y="accuracy")+geom_hline(yintercept = h,lty = "dashed",show.legend = TRUE )+geom_text(aes("all",h,label = "threshold = 0.8", vjust = -1))

filter selected models

selected by variances
# CV accuracy of the top-variance filter models (3/4/5 features) vs baseline
accuracy = data.frame(0,0)
colnames(accuracy) = c('mean','sd')
model_list_var = list( model.with.3.features_filter_variance,model.with.4.features_filter_variance,model.with.5.features_filter_variance)
for (i in 1:length(model_list_var)){
  accuracy[i,] = as.numeric(model_list_var[[i]]@model$cross_validation_metrics_summary['accuracy',c('mean','sd')])
}
accuracy['all',] = as.numeric( baseline_accuracy)
# Rows 1..3 correspond to the 3-, 4- and 5-feature models
rownames(accuracy)[1:3] = 3:5 
accuracy
##          mean          sd
## 3   0.4952847 0.009646328
## 4   0.5765822 0.016515600
## 5   0.6213171 0.008199125
## all 0.9824840 0.003290585
  • Accuracy plot
# ggplot2
# NOTE(review): x, y, sd are reused as plot scratch variables (see earlier note)
x  = rownames(accuracy)
y = accuracy[,1]
sd = accuracy[,2]
h = 0.8
qplot(x,y)+geom_errorbar(aes(x=x, ymin=y-sd, ymax=y+sd), width=0.25)+labs(title = "Accuracies of Models with Different Numbers of Most-variant Features Used", x="number of features used", y="accuracy")+geom_hline(yintercept = h,lty = "dashed",show.legend = TRUE )+geom_text(aes("all",h,label = "threshold = 0.8", vjust = -1))

selected by absolute coefficients of variation (abs cv)
# CV accuracy of the highest-|CV| filter models (3/4/5 features) vs baseline;
# note `accuracy` is reused/overwritten from the previous chunk
accuracy = data.frame(0,0)
colnames(accuracy) = c('mean','sd')
model_list_abs_cv = list( model.with.3.features_filter_abs_cv,model.with.4.features_filter_abs_cv,model.with.5.features_filter_abs_cv)
for (i in 1:length(model_list_abs_cv)){
  accuracy[i,] = as.numeric(model_list_abs_cv[[i]]@model$cross_validation_metrics_summary['accuracy',c('mean','sd')])
}
accuracy['all',] = as.numeric( baseline_accuracy)
# Rows 1..3 correspond to the 3-, 4- and 5-feature models
rownames(accuracy)[1:3] = 3:5 
accuracy
##          mean          sd
## 3   0.4630966 0.010582850
## 4   0.7972229 0.009651011
## 5   0.7953448 0.009243531
## all 0.9824840 0.003290585
  • Accuracy plot
# ggplot2
x  = rownames(accuracy)
y = accuracy[,1]
sd = accuracy[,2]
h = 0.8
qplot(x,y)+geom_errorbar(aes(x=x, ymin=y-sd, ymax=y+sd), width=0.25)+labs(title = "Accuracies of Models with Different Numbers of Highest-Abs-CV Features Used", x="number of features used", y="accuracy")+geom_hline(yintercept = h,lty = "dashed",show.legend = TRUE )+geom_text(aes("all",h,label = "threshold = 0.8", vjust = -1))

view the selected features

  • 5 features
  • classification accuracy >= 0.8
# Smallest number of top-ranked features whose mean CV accuracy reaches 0.8.
# Equivalent to the original first-hit loop (which also used a needless <<-),
# expressed with the which() idiom; yields NA if no model reaches 0.8.
acc = accuracy.randForest[,"mean" ]
num_features_selected = which(acc >= 0.8)[1]
num_features_selected
## [1] 3
# The model trained on that many top features, and the feature names used
selected_model = model_list[[num_features_selected]]
selected_features = top5_important_feature[1:num_features_selected]

view the confusion matrix of model using 3 features

# Confusion matrix of the selected 3-feature model (error rises mostly for
# the dynamic activities, e.g. walkdown)
h2o.confusionMatrix(selected_model)
## Confusion Matrix: vertical: actual; across: predicted
##          laying sitting standing walk walkdown walkup  Error          Rate
## laying     1092       0        0    0        0      0 0.0000 =   0 / 1,092
## sitting       0     961       22   10       14      7 0.0523 =  53 / 1,014
## standing      0      29      946   51       30     28 0.1273 = 138 / 1,084
## walk          0      14       58  808       50     25 0.1539 =   147 / 955
## walkdown      0      12       21   77      554    100 0.2749 =   210 / 764
## walkup        0       8        8   26       69    715 0.1344 =   111 / 826
## Totals     1092    1024     1055  972      717    875 0.1149 = 659 / 5,735

accuracy across cv of the selected model

# Mean/sd of each CV metric for the selected model across the 10 folds
selected_model@model$cross_validation_metrics_summary[,c('mean', 'sd')]
##                            mean           sd
## accuracy              0.8808801  0.008699819
## err                  0.11911989  0.008699819
## err_count                  68.2     4.886717
## logloss              0.40504476  0.039050993
## max_per_class_error  0.28640023   0.04486839
## mse                 0.107486196  0.003819107
## r2                   0.96210855 0.0016113676

view details

# Forest structure summary (tree count, depth, leaves)
selected_model@model$model_summary
## Model Summary: 
##   number_of_trees model_size_in_bytes min_depth max_depth mean_depth
## 1             300             1070802         1        20   16.84667
##   min_leaves max_leaves mean_leaves
## 1          2        515   303.62000
view all stdout details
# Printing the model object shows the full training + CV metrics report
selected_model
## Model Details:
## ==============
## 
## H2OMultinomialModel: drf
## Model ID:  DRF_model_R_1475356175629_4 
## Model Summary: 
##   number_of_trees model_size_in_bytes min_depth max_depth mean_depth
## 1             300             1070802         1        20   16.84667
##   min_leaves max_leaves mean_leaves
## 1          2        515   303.62000
## 
## 
## H2OMultinomialMetrics: drf
## ** Reported on training data. **
## Description: Metrics reported on Out-Of-Bag training samples
## 
## Training Set Metrics: 
## =====================
## Metrics reported on Out-Of-Bag training samples 
## 
## Extract training frame with `h2o.getFrame("samsungData_fixed-duplicated-columns_sid_9525_2")`
## MSE: (Extract with `h2o.mse`) 0.1072876
## R^2: (Extract with `h2o.r2`) 0.9622172
## Logloss: (Extract with `h2o.logloss`) 0.598621
## Confusion Matrix: Extract with `h2o.confusionMatrix(<model>,train = TRUE)`)
## =========================================================================
## Confusion Matrix: vertical: actual; across: predicted
##          laying sitting standing walk walkdown walkup  Error          Rate
## laying     1092       0        0    0        0      0 0.0000 =   0 / 1,092
## sitting       0     961       22   10       14      7 0.0523 =  53 / 1,014
## standing      0      29      946   51       30     28 0.1273 = 138 / 1,084
## walk          0      14       58  808       50     25 0.1539 =   147 / 955
## walkdown      0      12       21   77      554    100 0.2749 =   210 / 764
## walkup        0       8        8   26       69    715 0.1344 =   111 / 826
## Totals     1092    1024     1055  972      717    875 0.1149 = 659 / 5,735
## 
## Hit Ratio Table: Extract with `h2o.hit_ratio_table(<model>,train = TRUE)`
## =======================================================================
## Top-6 Hit Ratios: 
##   k hit_ratio
## 1 1  0.885092
## 2 2  0.963208
## 3 3  0.983261
## 4 4  0.989364
## 5 5  0.990235
## 6 6  1.000000
## 
## 
## 
## H2OMultinomialMetrics: drf
## ** Reported on cross-validation data. **
## Description: 10-fold cross-validation on training data (Metrics computed for combined holdout predictions)
## 
## Cross-Validation Set Metrics: 
## =====================
## 10-fold cross-validation on training data (Metrics computed for combined holdout predictions) 
## 
## Extract cross-validation frame with `h2o.getFrame("samsungData_fixed-duplicated-columns_sid_9525_2")`
## MSE: (Extract with `h2o.mse`) 0.1074331
## R^2: (Extract with `h2o.r2`) 0.962166
## Logloss: (Extract with `h2o.logloss`) 0.40561
## Hit Ratio Table: Extract with `h2o.hit_ratio_table(<model>,xval = TRUE)`
## =======================================================================
## Top-6 Hit Ratios: 
##   k hit_ratio
## 1 1  0.881081
## 2 2  0.966173
## 3 3  0.989364
## 4 4  0.996338
## 5 5  0.997733
## 6 6  1.000000
## 
## 
## Cross-Validation Metrics Summary: 
##                            mean           sd cv_1_valid  cv_2_valid
## accuracy              0.8808801  0.008699819  0.8962433   0.8752026
## err                  0.11911989  0.008699819 0.10375671 0.124797404
## err_count                  68.2     4.886717         58          77
## logloss              0.40504476  0.039050993 0.43640986    0.500366
## max_per_class_error  0.28640023   0.04486839 0.28169015   0.3783784
## mse                 0.107486196  0.003819107 0.09968863 0.110442854
## r2                   0.96210855 0.0016113676 0.96564025   0.9613993
##                     cv_3_valid cv_4_valid cv_5_valid  cv_6_valid
## accuracy            0.88264465 0.90189326  0.8635514   0.8881789
## err                 0.11735537 0.09810671 0.13644859 0.111821085
## err_count                   71         57         73          70
## logloss              0.4445967  0.3741133 0.48623514  0.35710683
## max_per_class_error  0.2027027 0.18666667 0.32894737  0.23376623
## mse                 0.10546697 0.09904204 0.11496392 0.106322244
## r2                   0.9623406 0.96377623  0.9603755   0.9606866
##                     cv_7_valid cv_8_valid  cv_9_valid cv_10_valid
## accuracy            0.88928574 0.87523276   0.8645833  0.87198514
## err                 0.11071429 0.12476723  0.13541667  0.12801485
## err_count                   62         67          78          69
## logloss             0.39912796 0.34781325  0.36107048  0.34360805
## max_per_class_error     0.2875 0.37704918  0.26262626  0.32467532
## mse                 0.10474819 0.10700288 0.115492225    0.111692
## r2                   0.9651872 0.96346056   0.9581606   0.9600589

GLM

Train the model using the 3 most important features selected by the Random Forest

# y = colnames(samsungData.hex)[ncol]
# is.factor(samsungData.hex[ncol])
# #install.packages("ade4")
# library(ade4)
# y.array = acm.disjonctif(samsungData[ncol])
# colnames(y.array) = names(summary(factor(samsungData[,ncol])))
# samsungData.glm = samsungData
# samsungData.glm = samsungData.glm[,-563] 
# samsungData.glm = cbind(samsungData.glm , y.array)
# write.table(samsungData.glm, file = "samsungData.glm.csv", row.names = FALSE, col.names = TRUE, sep = "," )
# 
#   write.table(samsungData.glm[trainning_indices,], file = "samsungData.glm.train.csv", row.names = FALSE, col.names = TRUE, sep = "," )  
# 
#   write.table(samsungData.glm[test_indices,], file = "samsungData.glm.test.csv", row.names = FALSE, col.names = TRUE, sep = "," )
# 
# samsungData.hex.glm  = h2o.uploadFile("samsungData.glm.train.csv")
# 
# ### how to do glm -- > multi label !!
# glm.model.train = h2o.glm(x=top3_important_feature, y = colnames(y.array), training_frame = samsungData.hex.glm, family = "binomial")
# 
# library(nnet)
# top3_important_feature
# ind = match(top3_important_feature, colnames(samsungData))
# indexed_samsumData = samsungData
# colnames(indexed_samsumData) = make.names(colnames(samsungData), unique = TRUE)
# top3_important_feature_glm = colnames(indexed_samsumData) [ind]
# train.glm = multinom(activity ~ tGravityAcc.mean...Y + tGravityAcc.min...X +angle.X.gravityMean.,data=indexed_samsumData[trainning_indices,])
# 
# z <- summary(train.glm)$coefficients/summary(train.glm)$standard.errors
# z
# #2-tailed z test
# p <- (1 - pnorm(abs(z), 0, 1))*2
# p # how does this work?
# exp(coef(train.glm))
# head(pp <- fitted(train.glm))
# 
# d.samsungData.train = indexed_samsumData[trainning_indices,c(42,53,559,563)]
# 
# train.predicted.glm = predict(train.glm, newdata = d.samsungData.train )
# 
# ## confusion matrix
# table(train.predicted.glm, indexed_samsumData[trainning_indices,563])
# 
# # accuracy
# sum(train.predicted.glm==indexed_samsumData[trainning_indices,563])/length(train.predicted.glm)
# 
# 
# ############## don't touch this
# 
# d.samsungData.test = indexed_samsumData[test_indices,c(42,53,559,563)]
# 
# test.predicted.glm = predict(train.glm, newdata = d.samsungData.test )

Test

  • using 3 selected features
samsungData.test.hex = h2o.uploadFile(path = "samsungData_fixed-duplicated-columns.test.csv")
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%
# 1617 test rows x 563 columns (column 562 = subject, 563 = activity label, per training data)
dim(samsungData.test.hex)
## [1] 1617  563
# ncol = dim(samsungData.test.hex)[2]
# x =selected_features
# y = colnames(samsungData.test.hex)[ncol]
# 
# # classification of the test data with random forest
# tryCatch( {
#   model.test<<- h2o.getModel(list_models['test_model',])
# }, error = function(e) {
#   model.test <<- h2o.randomForest(x, y, seed = 123, samsungData.test.hex)
#   write(paste("test_model",model.test@model_id, sep="\t"), "model_id.txt", append = TRUE)
# })

# Time the prediction step of the 3-feature model on the held-out test set
ptm <- proc.time() # start timer
test.predicted = h2o.predict(model.with.3.features, newdata = samsungData.test.hex)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%
proc.time() - ptm # 'elapsed' is wall-clock time (includes time spent in the H2O server)
##    user  system elapsed 
##   0.051   0.004   1.095
# Accuracy: fraction of test rows where the prediction matches the true label.
# NOTE(review): `ncol` is a variable defined earlier in the document
# (presumably 563, the label column index) -- verify against that definition.
test.accuracy = sum(test.predicted[,'predict']==samsungData.test.hex[,ncol])/dim(samsungData.test.hex[,ncol])[1]
test.accuracy
## [1] 0.6672851
# Confusion matrix (rows: predicted, columns: actual).  as.matrix() pulls the
# H2O frames into local R objects because table() cannot consume an H2OFrame.
table(as.matrix(test.predicted[,'predict']),as.matrix(samsungData.test.hex[,ncol]))
##           
##            laying sitting standing walk walkdown walkup
##   laying      315       0        0    0        0      0
##   sitting       0     229       51   21       23      0
##   standing      0      39      175   84        4     33
##   walk          0       1       48  109       20     43
##   walkdown      0       3       12   50       99     19
##   walkup        0       0        4    7       76    152

view the test results

confusion matrix
# h2o.confusionMatrix(model.test)
# test.accuracy = 1 - h2o.confusionMatrix(model.test)['Totals', 'Error']
# test.accuracy

Compare runtime in training and cross-validation

  • in milliseconds
  • baseline model: 25518
  • model with 1-feature: 2995
  • model with 2-feature: 2563
  • model with 3-feature: 2310
  • model with 4-feature: 2602
  • model with 5-feature: 2611
# Training run time (ms) of the baseline (all-features) model
model@model$run_time
## [1] 25518
# Per-fold CV run times; vapply preallocates a typed result instead of
# growing a vector with c() on every iteration
temp = vapply(
  h2o.cross_validation_models(model),
  function(m) m@model$run_time,
  numeric(1)
)
print(c(mean(temp), sd(temp)))
## [1] 26984.100  4633.974
# Mean/sd of CV run times for each reduced-feature model (rows = 1-5 features).
# Preallocate the result frame and use seq_along instead of 1:length(),
# which misbehaves when the list is empty.
results = data.frame(numeric(length(model_list)), numeric(length(model_list)))
for (i in seq_along(model_list)) {
  # Per-fold run times for model i, collected without growing a vector
  runtimes = vapply(
    h2o.cross_validation_models(model_list[[i]]),
    function(this.m) this.m@model$run_time,
    numeric(1)
  )
  results[i,] = c(mean(runtimes), sd(runtimes))
}
colnames(results) = c("mean", "sd")
results
##      mean        sd
## 1 25422.2 1425.9127
## 2 23431.0  840.9693
## 3 23505.9  150.2069
## 4 25315.6  633.6601
## 5 26057.9 1257.7636

Compare model sizes in training and cross-validation

  • in bytes
  • sizes are not significantly different
# In-memory size of the baseline (all-features) model object
object.size(model)
## 224440 bytes
# Per-fold CV model sizes (bytes); vapply preallocates instead of growing
# a vector with c() on every iteration.  as.numeric strips the object_size
# class so the values satisfy vapply's numeric(1) template.
temp = vapply(
  h2o.cross_validation_models(model),
  function(m) as.numeric(object.size(m)),
  numeric(1)
)
print(c(mean(temp), sd(temp)))
## [1] 200669.6000    105.7326
# Collect (size, group) for every CV fold of each reduced-feature model.
# Accumulating rows in a list and binding once avoids the quadratic cost of
# rbind.data.frame() inside a nested loop; seq_along is safe on empty lists.
rows = list()
for (i in seq_along(model_list)) {
  for (this.m in h2o.cross_validation_models(model_list[[i]])) {
    rows[[length(rows) + 1]] = c(as.numeric(object.size(this.m)), i)
  }
}
results = as.data.frame(do.call(rbind, rows))

colnames(results) = c("size", "group")
results$group = factor(results$group)
results
##     size group
## 1  50728     1
## 2  50760     1
## 3  50752     1
## 4  50760     1
## 5  50616     1
## 6  50760     1
## 7  50616     1
## 8  50616     1
## 9  50616     1
## 10 50744     1
## 11 50880     2
## 12 51064     2
## 13 51056     2
## 14 51064     2
## 15 50896     2
## 16 50888     2
## 17 50880     2
## 18 50880     2
## 19 51048     2
## 20 51048     2
## 21 51144     3
## 22 51336     3
## 23 51328     3
## 24 51152     3
## 25 51152     3
## 26 51336     3
## 27 51136     3
## 28 51144     3
## 29 51328     3
## 30 51328     3
## 31 51504     4
## 32 51640     4
## 33 51504     4
## 34 51400     4
## 35 51288     4
## 36 51512     4
## 37 51344     4
## 38 51280     4
## 39 51504     4
## 40 51280     4
## 41 51816     5
## 42 52048     5
## 43 51968     5
## 44 51752     5
## 45 51752     5
## 46 51752     5
## 47 51736     5
## 48 51744     5
## 49 51968     5
## 50 51744     5
# Per-group (number of features used) mean and sd of CV model size
reduced_means = aggregate(x = results$size, by = list(results$group), FUN = mean)
reduced_sd = aggregate(x = results$size, by = list(results$group), FUN = sd)
reduced_results = setNames(
  cbind.data.frame(reduced_means$x, reduced_sd$x),
  c("mean", "sd")
)

# Plot group means with +/- 1 sd error bars
x = rownames(reduced_results)
y = reduced_results[, "mean"]
sd = reduced_results[, "sd"]
qplot(x, y) +
  geom_errorbar(aes(x = x, ymin = y - sd, ymax = y + sd), width = 0.25) +
  labs(title = "Model size of models with different number of features (1-5)",
       x = "number of features used",
       y = "bytes") +
  scale_y_continuous(limits = c(min(results$size), max(results$size)))

# One-way ANOVA: does mean model size differ across feature-count groups?
selected_aov = aov(size ~ factor(group), data = results)
selected_aov
## Call:
##    aov(formula = size ~ factor(group), data = results)
## 
## Terms:
##                 factor(group) Residuals
## Sum of Squares        7476114    472621
## Deg. of Freedom             4        45
## 
## Residual standard error: 102.4826
## Estimated effects may be unbalanced
summary(selected_aov)
##               Df  Sum Sq Mean Sq F value Pr(>F)    
## factor(group)  4 7476114 1869028     178 <2e-16 ***
## Residuals     45  472621   10503                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Between reduced (feature-selected) and non-reduced (all-features) models
# Append the baseline model's CV fold sizes (group "all") to the reduced
# models' results.  Binding a typed one-row data.frame keeps `size` numeric;
# the original rbind of c(object.size(m), "all") coerced the whole column to
# character and then had to patch it back with as.numeric().
full_vs_reduce = results
full_vs_reduce$group = as.character(full_vs_reduce$group)
for (m in h2o.cross_validation_models(model)) {
  full_vs_reduce = rbind.data.frame(
    full_vs_reduce,
    data.frame(size = as.numeric(object.size(m)), group = "all")
  )
}
# Defensive coercion, kept so downstream code is safe even if upstream types drift
full_vs_reduce$size = as.numeric(full_vs_reduce$size)


# Per-group mean/sd of model size, now including the "all"-features group
all_means = aggregate(x = full_vs_reduce$size, by = list(full_vs_reduce$group), FUN = mean)
all_sd = aggregate(x = full_vs_reduce$size, by = list(full_vs_reduce$group), FUN = sd)
all_results = setNames(cbind.data.frame(all_means$x, all_sd$x), c("mean", "sd"))
rownames(all_results) = all_means$Group.1

# Plot group means with +/- 1 sd error bars
x = rownames(all_results)
y = all_results[, "mean"]
sd = all_results[, "sd"]
qplot(x, y) +
  geom_errorbar(aes(x = x, ymin = y - sd, ymax = y + sd), width = 0.25) +
  labs(title = "Model size of models with different numbers of features",
       x = "number of features used",
       y = "bytes") +
  scale_y_continuous(limits = c(min(full_vs_reduce$size), max(full_vs_reduce$size)))

# One-way ANOVA including the full-feature group ("all") alongside groups 1-5
full_aov = aov(size ~ factor(group), data = full_vs_reduce)
full_aov
## Call:
##    aov(formula = size ~ factor(group), data = full_vs_reduce)
## 
## Terms:
##                 factor(group)    Residuals
## Sum of Squares   186104510396       573235
## Deg. of Freedom             5           54
## 
## Residual standard error: 103.0314
## Estimated effects may be unbalanced
summary(full_aov)
##               Df    Sum Sq   Mean Sq F value Pr(>F)    
## factor(group)  5 1.861e+11 3.722e+10 3506290 <2e-16 ***
## Residuals     54 5.732e+05 1.062e+04                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Save models to local machine
# Persist all trained models to disk.
# BUG FIX: deparse(substitute(m)) on a loop variable always deparses to the
# literal string "m", so every model from all three lists was written under
# the single directory "<wd>_m".  Use an explicit directory per model list;
# within a directory h2o.saveModel names files by model id.
h2o.saveModel(model, paste(getwd(), "model", sep = "_"))
for (m in model_list) {
  h2o.saveModel(m, paste(getwd(), "model_list", sep = "_"))
}
h2o.saveModel(model.test, paste(getwd(), "model.test", sep = "_"))
for (m in model_list_var) {
  h2o.saveModel(m, paste(getwd(), "model_list_var", sep = "_"))
}
for (m in model_list_abs_cv) {
  h2o.saveModel(m, paste(getwd(), "model_list_abs_cv", sep = "_"))
}

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this: